Class 03

Packages

Packages (there’s a lot!)

library(tidyverse)
library(lmtest)  # robust SEs 
library(sandwich) 
library(countrycode) # for matching country names
library(dataverse)   # for accessing replication data on the dataverse
library(WDI)         # World Developement indicators API 
library(ggdist)      # Distribution plots
library(marginaleffects)  # marginal effects estimation tools 
library(modelsummary)
library(performance)

Also, if you don’t have it already, you’ll want to install the vdemdata package. This provides easy access to the latest version of the Varieties of Democracy Dataset :

devtools::install_github("vdeminstitute/vdemdata")

Preparing the data

First, we need to do a little data set construction. This is messy, but rather than leave it out I figured I would show it here to give a sense of the process.

We’ll pull from three difference data sources:

  • Varieties of Democracy (V-DEM) for global data on democracy and governance (codebook)

  • World Bank Data world (information) (R package info) for economic indicators

  • Global Legislators Data (dataverse) (codebook) for information about the ages of legislators

# get global legislators data 
gld <- get_dataframe_by_name(
  filename = "global_legislator_database.tab",
  dataset = "10.7910/DVN/U1ZNVT", 
  server = "dataverse.harvard.edu")


# aggregate to get average age for each legislature 
gld_agg <- gld |>
  group_by(country_name, year_of_election) |>
  summarise(
    nleg = n(), 
    mean_age = mean(year_of_election - year(dob), na.rm = T)
  )|>
  # add VDEM matching code 
  mutate(vdem_code =   countrycode(country_name, origin = 'country.name', destination = 'vdem'))


# get vdem data. Filter by year and select usefil columns 
vdem<-vdemdata::vdem|>
  filter(year>=2010)|>
  select(year, country_id, e_pelifeex, v2elpubfin,v2xnp_client,e_regionpol, v2elparlel, v2x_corr, v2x_polyarchy)|>
  mutate(electoral_system = factor(v2elparlel, labels=c("Majoritarian", "Proportional", "Mixed","Other")),
         
         )|>
  rename(public_finance = v2elpubfin)


# get data from the world bank WDI 
wdi_data<-WDI(indicator=c('gdp_pcap'= 'NY.GDP.PCAP.KD',
                          'gini' = 'SI.POV.GINI',
                          'life_expectancy'  = 'SP.DYN.LE00.IN',
                          'pop_over_65'= 'SP.POP.65UP.TO.ZS'), 
                start=2010, end=2023
)


# Add a vdem matching code
wdi_data<-wdi_data|>
  mutate(vdem_code =   countrycode(country, origin='country.name', destination='vdem'))
  

# Join everything on country and year. inner_join means we're dropping anything 
# without a match, be careful with this (left joins are often better)

gld_joined<-gld_agg|>
  inner_join(vdem, by=join_by(vdem_code == country_id, year_of_election == year))|>
  inner_join(wdi_data, by = join_by(vdem_code == vdem_code, year_of_election == year) )

Research Question

The U.S. has a lot of old people in congress. Why is this? How do we compare to other states? And are their structural features that might explain why we’ve got so many people born before the invention of commercially viable color television in positions of power?

gld|>
  filter(country_name=="United States")|>
  ggplot(aes(x=dob)) +
  geom_density(fill='lightblue', alpha=.8) +
  geom_vline(xintercept = as.Date("1954-12-17"), lty=2) +
  annotate("text", x=as.Date("1957-12-31") + 1500, y=.000002, label="invention of color tv") +
  theme_bw() +
  xlab("Birthdates of members of Congress") +
  ylab("")

Model output

We’ll run three slightly different models and present them in a single table.

gof_stats <- c(
  "nobs", "aic", "bic", "r.squared"
)

coef_names <- c("life_expectancy" = "Life expectancy",
                "gdp_pcap" = "GDP per-capita",
                "log(gdp_pcap)" = "Logged GDP per-capita",
                "public_finance" = "Public financing of elections"
                )

model_0<-lm(mean_age ~ life_expectancy + gdp_pcap,
          data=gld_joined)
model_1<-lm(mean_age ~ life_expectancy + log(gdp_pcap), data =gld_joined)
model_2<-lm(mean_age ~ life_expectancy + log(gdp_pcap) + public_finance , data =gld_joined)



mlist<-list(
  'Baseline model' = model_0,
  'Logged GDP model' = model_1,
  'Public financing' = model_2
  )


modelsummary(mlist,
       gof_map = gof_stats,
       coef_rename = coef_names,
       coef_omit = "Intercept"
       
       )
tinytable_zrbunh9tstl75m0dptwa
Baseline model Logged GDP model Public financing
Life expectancy -0.114 -0.241 -0.177
(0.063) (0.090) (0.087)
GDP per-capita 0.000
(0.000)
Logged GDP per-capita 1.156 1.474
(0.537) (0.516)
Public financing of elections -0.905
(0.262)
Num.Obs. 94 94 94
AIC 515.6 511.7 502.0
BIC 525.7 521.9 514.7
R2 0.036 0.074 0.183

Checks

At a minimum, we should look for evidence of:

  • non-normality

  • heteroskedasticity

  • influential observations

  • non-linearity

The check_model function gives us a good starting point for identifying these issues:

check_model(model_0)

The Breusch–Pagan test gives us a way to test for “significant” heteroskedasticity by running a new linear model with the squared residuals as our dependent variable.

The null hypothesis here is homoskedastic data, and the alternative is heteroskedasticity:

squared_resid<-resid(model_0)^2
resid_model<-lm(squared_resid ~ life_expectancy + gdp_pcap, data = gld_joined)
stat<-nrow(resid_model$model) * summary(resid_model)$r.squared
1-pchisq(stat, df=2)
[1] 0.04699436

The performance package has a built-in function for running a variation of the Breusch-Pagan test which we can run with check_heteroskedasticity

Warning: Heteroscedasticity (non-constant error variance) detected (p = 0.008).

We might also want to check for evidence of multicollinearity, although this appears to be less of a problem here:

And checks for individual outliers (defined as cook’s distance greater than 0.8)

OK: No outliers detected.
- Based on the following method and threshold: cook (0.8).
- For variable: (Whole model)

Based on the analyses above, do you see evidence to support any changes to the model such as transforming variables?

Robust Standard Errors

Since we have some clear evidence of heteroskedasticity, it probably makes sense to go ahead and use heteroskedasticity robust standard errors, which relax the assumption of constant variance.

We can get heteroskedasticity consistent standard errors using the coeftest function from the lmtest package.

library(lmtest)



m_0<-coeftest(model_0, vcov = vcovHC(model_0, "HC0"))   
modelsummary(list("classic" = model_0, "robust" = m_0),
       gof_map = gof_stats,
       coef_rename = coef_names,
       coef_omit = "Intercept")
tinytable_87brgw9t41paslt6dpgj
classic robust
Life expectancy -0.114 -0.114
(0.063) (0.068)
GDP per-capita 0.000 0.000
(0.000) (0.000)
Num.Obs. 94 94
AIC 515.6 689.6
BIC 525.7 921.0
R2 0.036

The model summary package will also apply these changes directly to a list models, so I can try out different methods of calculating the standard errors and p-values for the same model:

modelsummary(model_0,
       gof_map = gof_stats,
       coef_rename = coef_names,
       vcov = c("classical", "HC0", "bootstrap")
       )
tinytable_y367e20ut0x4wjbc32x3
(1) (2) (3)
(Intercept) 57.855 57.855 57.855
(4.430) (4.943) (5.165)
Life expectancy -0.114 -0.114 -0.114
(0.063) (0.068) (0.071)
GDP per-capita 0.000 0.000 0.000
(0.000) (0.000) (0.000)
Num.Obs. 94 94 94
AIC 515.6 515.6 515.6
BIC 525.7 525.7 525.7
R2 0.036 0.036 0.036

Model Comparison

Which models are better? We can compare models using BIC/AIC (lower scores mean better models, all else equal)

compare_performance(list(model_0, model_1, model_2))

Or, if the models are nested, we can compare using F-tests

anova(model_0, model_1)
anova(model_1, model_2)

Equivalence Testing

We can see that the effect of electoral_system is not statistically significant, but what if we want to rule out the possibility that the a PR system would result in younger legislators relative to a majoritarian system?

model<-lm(mean_age ~ 
            life_expectancy + 
            public_finance +
            electoral_system, data=gld_joined)


model_robust<-coeftest(model, vcov = vcovHC(model, "HC0"))   

model_robust

t test of coefficients:

                              Estimate Std. Error t value  Pr(>|t|)    
(Intercept)                  49.048664   5.954992  8.2366 1.419e-12 ***
life_expectancy               0.028756   0.086836  0.3311    0.7413    
public_finance               -0.664078   0.327375 -2.0285    0.0455 *  
electoral_systemProportional -1.089526   1.191739 -0.9142    0.3631    
electoral_systemMixed        -0.771819   1.486539 -0.5192    0.6049    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

We’ll set a minimum effect size of 0.66112. Essentially, we are attempting to rule out the possibility that the effect of proportional representation (relative to majoritarianism) is greater than the effect of public financing for elections:

min_effect <- 0.66112

broom::tidy(model_robust, conf.int = TRUE, conf.level = .90)|>
    filter(term == 'electoral_systemProportional')|>
    select(term, estimate, conf.low, conf.high)|>
   mutate(equi_lb = min_effect*-1, 
          equi_ub = min_effect)|>
  ggplot(aes(y=term)) + geom_pointrange(aes(xmin=conf.low, xmax=conf.high, x=estimate)) +
  geom_vline(aes(xintercept=equi_lb), color='red', lty=2) +
  geom_vline(aes(xintercept=equi_ub), color='red', lty=2) + 
  theme_bw()

An alternative to using heteroskedasticity consistent standard errors is to use the non-parametric bootstrap. This also allows us to generate a useful visualization of the cases where we might see effects above the minimum threshold we set above:

comp <- avg_comparisons(model,
                        variables = list("electoral_system" = c("Majoritarian", "Proportional")),
                        vcov = 'HC0')

booted<-inferences(comp, method='boot', conf_type = 'perc', R=1000)
draws<-get_draws(booted)
ggplot(draws, aes(x=draw, y='Simulated Differences'))+

   stat_halfeye(alpha=.5, .width = c(.90), aes(fill=after_stat(x>=min_effect | x<= -min_effect))) +
    geom_vline(xintercept=-min_effect, lty=2, col='black') +
    geom_vline(xintercept=min_effect, lty=2, col='black') +
  theme_bw() +
  scale_fill_brewer(palette='Dark2')+
  xlab("Simulated Differences") +
  ylab("")+
  coord_flip()

Based on these results, can we rule out an effect for proportional representation?